# Install packages not yet installed
packages <- c("dplyr", "pls", "hyperSpec", "prospectr", "mdatools", "ggplot2", "viridis", "mgcv", "caret", "MuMIn", "gglm", "gratia", "tidyr", "ggpubr", "corrplot")
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
utils::install.packages(pkgs = packages[!installed_packages])
}
invisible(lapply(packages, library, character.only = TRUE)) # load all packages in list
# install packages not on CRAN
if (!require("remotes")) install.packages("remotes")
rm(installed_packages) # remove objects from environment
rm(packages)
# all 3 FT-NIRS scans, no preprocessing or wavenumber filters
dfmeta_LPW <- readRDS("LPW_dfmeta.RDS")
# average of scans 2 & 3, no preprocessing
scan_2 <- dfmeta_LPW %>% dplyr::filter(run_number == 2)
scan_3 <- dfmeta_LPW %>% dplyr::filter(run_number == 3)
scan_avg_raw <- bind_cols(NULL, scan_2[, 1:20])
scan_avg_raw <- bind_cols(scan_2[, 1:20], (scan_2[, 21:ncol(scan_2)] + scan_3[, 21:ncol(scan_3)]) / 2) # average for all absorbance measurements
scan_avg_raw <- scan_avg_raw[complete.cases(scan_avg_raw$read_age), ]
rm(scan_2, scan_3)
# preprocess and remove wavenumbers > 7500
scan_avg <- cbind(scan_avg_raw[, c(1:20)], savitzkyGolay(scan_avg_raw[, 21:length(scan_avg_raw)], m = 1, p = 3, w = 17))
scan_avg <- scan_avg[, -c(21:517)]
# long_format for plotting
scan_avg_raw_long <- pivot_longer(scan_avg_raw, cols = `11536`:`3952`, names_to = "name", values_to = "value")
scan_avg_raw_long$name <- as.numeric(scan_avg_raw_long$name)
scan_avg_long <- pivot_longer(scan_avg, cols = `7496`:`4016`, names_to = "name", values_to = "value")
scan_avg_long$name <- as.numeric(scan_avg_long$name)
### Figure 1 - Sample date vs length
# plot of specimens used vs sample date to demonstrate fish got larger over time
ggplot(scan_avg %>% dplyr::filter(read_age != 175, read_age != 135, specimen != 65, read_age != 147, read_age != 181, read_age != 161, read_age != 188),
aes(x = sample_date, y = length)) +
geom_point(size = 3) +
xlab("Sample Date") +
ylab("Fork length (mm)") +
theme_bw() +
geom_smooth(method = "lm", col = "black") +
theme(
axis.title = element_text(size = 20),
axis.text = element_text(size = 15))
## `geom_smooth()` using formula = 'y ~ x'
### Figure 2 - raw vs preprocessed spectra, outliers removed
raw_spec <- ggplot(
scan_avg_raw_long %>% dplyr::filter(read_age != 175, read_age != 135, specimen != 65, read_age != 147, read_age != 181, read_age != 161, read_age != 188),
aes(x = name, y = value, group = specimen, color = read_age)
) +
geom_path() +
scale_x_reverse() +
scale_color_viridis() +
labs(
color = "Age (days)",
y = "Raw absorbance", x = ""
) +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 11),
legend.text = element_text(size = 10),
legend.title = element_text(size = 17)
) +
geom_vline(xintercept = 7500, col = "red", linewidth = 2, linetype = "dashed")
proc_spec <- ggplot(
scan_avg_long %>%
dplyr::filter(read_age != 175, read_age != 135, specimen != 65, read_age != 147, read_age != 181, read_age != 161, read_age != 188),
aes(x = name, y = value, group = specimen, color = read_age)
) +
geom_path() +
scale_x_reverse() +
scale_color_viridis() +
labs(color = "Age (days)") +
labs(y = "Preprocessed absorbance", x = expression(paste("Wavenumber ", cm^-1))) +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 11),
legend.text = element_text(size = 10, vjust = .2),
legend.title = element_text(size = 15, vjust = 1)
)
ggarrange(raw_spec, proc_spec, ncol = 1, common.legend = T)
rm(raw_spec, proc_spec, scan_avg_long, scan_avg_raw_long, scan_avg_raw)
#### Figure 3 - Correlation plot of wavenumbers 5264 - 4016 cm-1
corrplot(cor(scan_avg[, 300:456]), tl.pos='n', cl.cex = 1.8)
#### Figure 3 - PCA and outliers removed ###
# Plot PCA, length by color, outliers removed triangles
pcs <- prcomp(scan_avg[, 21:ncol(scan_avg)])
pcs <- pcs$x[, 1:2]
scan_avg <- cbind(pcs, scan_avg)
rm(pcs)
# specimens removed from dataset
specimens <- scan_avg %>%
dplyr::filter(read_age == 175 | read_age == 135 | specimen == 65 | read_age == 147 | read_age == 181 | read_age == 161 | read_age == 188) %>%
select(specimen)
ggplot() +
geom_point(data = scan_avg[!scan_avg$specimen %in% specimens$specimen, ], aes(x = PC1, y = PC2, color = read_age), size = 3.5) +
scale_color_viridis() +
theme_bw() +
theme(
axis.title = element_text(size = 25),
axis.text = element_text(size = 20),
legend.text = element_text(size = 15, color = "black"),
legend.title = element_text(size = 18),
legend.position.inside = c(0.9, 0.8)
) +
labs(color = "Age (days)") +
geom_point(data = scan_avg[scan_avg$specimen %in% specimens$specimen, ], aes(x = PC1, y = PC2, color = read_age), shape = 17, size = 4)
rm(specimens)
# remove outliers from dataframe
scan_avg_filter <- scan_avg %>%
dplyr::filter(read_age != 175, read_age != 135, specimen != 65, read_age != 147, read_age != 181, read_age != 161, read_age != 188)
scan_avg_filter <- scan_avg_filter[, -c(1:2)] # remove PC columns
# 10 fold CV split - create folds
set.seed(6)
splits <- caret::createFolds(scan_avg_filter$read_age, k = 10, list = TRUE, returnTrain = FALSE)
# extract PC's for each calibration set, create test sets with ages and spectra
cal <- list()
test <- list()
for (i in 1:10) {
# calibration set and PC's
pc.mod <- preProcess(scan_avg_filter[-splits[[i]], -c(1:20)], method = "pca", thresh = 0.95, pcaComp = 4)
pc.cal <- predict(pc.mod, scan_avg_filter[-splits[[i]], -c(1:20)])
pc.cal <- cbind(pc.cal, scan_avg_filter[-splits[[i]], ])
cal[[i]] <- pc.cal
# test sets
pc.test <- predict(pc.mod, scan_avg_filter[splits[[i]], -c(1:20)])
pc.test <- cbind(pc.test, scan_avg_filter[splits[[i]], ])
test[[i]] <- pc.test
}
rm(pc.cal, pc.test, pc.mod, scan_avg)
# store metrics from each fold and each model type
RMSE.age <- list()
r2.age <- list()
AIC.age <- list()
AICc.age <- list()
table(unlist(mod.sel))
##
## (Intercept) PC1 PC2 PC3 PC4
## 10 10 4 10 5
# PC 2 may be uninformative, will leave out of lm(). PC4 only used in half, also possibly not as informative
# lm() with 10-fold CV
lm.mods <- list()
for (i in 1:10) {
calibrate <- cal[[i]]
testing <- test[[i]]
mod <- lm(data = calibrate, read_age ~ PC1 + PC3 + PC4)
RMSE.age$lm.cal[i] <- RMSE(pred = mod$fitted.values, obs = calibrate[, 15])
preds <- predict(mod, newdata = testing)
RMSE.age$lm.test[i] <- RMSE(pred = preds, obs = testing[, 15])
r2.age$lm.cal[i] <- summary(mod)$r.squared
RSS <- sum((testing$read_age - preds)^2)
TSS <- sum((testing$read_age - mean(testing$read_age))^2)
r2.age$lm.test[i] <- 1 - (RSS / TSS)
AIC.age$lm[i] <- AIC(mod)
AICc.age$lm[i] <- AICc(mod)
lm.mods[[i]] <- mod
}
RMSE.age$lm.cal <- mean(RMSE.age$lm.cal)
RMSE.age$lm.test <- mean(RMSE.age$lm.test)
r2.age$lm.cal <- mean(r2.age$lm.cal)
r2.age$lm.test <- mean(r2.age$lm.test)
# GAM with 10 fold CV, select = T allows PCs to be penalized and effectively removed from model if appropriate.
GAM.mods <- list()
for (i in 1:10) {
calibrate <- cal[[i]]
testing <- test[[i]]
mod <- gam(data = calibrate, read_age ~ s(PC1) + s(PC2) + s(PC3) + s(PC4), method = "REML")
# Extract AIC, AICc, RMSE (cal & test) & r2 (cal & test)
RMSE.age$GAM.cal[i] <- caret::RMSE(pred = mod$fitted.values, obs = calibrate[, 13])
preds <- predict(mod, newdata = testing)
RMSE.age$GAM.test[i] <- caret::RMSE(pred = preds, obs = testing[, 15])
r2.age$gam.cal[i] <- summary(mod)$r.sq
RSS <- sum((testing$read_age - preds)^2)
TSS <- sum((testing$read_age - mean(testing$read_age))^2)
r2.age$gam.test[i] <- 1 - (RSS / TSS)
AIC.age$gam[i] <- AIC(mod)
AICc.age$gam[i] <- AICc(mod)
GAM.mods[[i]] <- mod
}
r2.age$gam.cal <- mean(r2.age$gam.cal)
r2.age$gam.test <- mean(r2.age$gam.test)
RMSE.age$GAM.cal <- mean(RMSE.age$GAM.cal)
RMSE.age$GAM.test <- mean(RMSE.age$GAM.test)
mod.summary <- data.frame(
model = c("lm_cal", "lm_test", "gam_cal", "gam_test"),
r2 = 1:4,
RMSE = 1:4
)
mod.summary$r2 <- unlist(r2.age)
mod.summary$RMSE <- unlist(RMSE.age)
AIC.summary <- data.frame(
model = c(rep("lm", 10), rep("gam", 10)),
AIC = 1:20,
AICc = 1:20
)
# AIC.age <- AIC.age[-3]
AIC.summary$AIC <- unlist(AIC.age)
AIC.summary$AICc <- unlist(AICc.age)
# LM
mean(AIC.summary[1:10, 2]) # AIC
## [1] 380.8295
mean(AIC.summary[1:10, 3]) # AICc
## [1] 382.2382
# GAM
mean(AIC.summary[11:20, 2]) # AIC
## [1] 368.9412
mean(AIC.summary[11:20, 3]) # AICc
## [1] 378.0549
rm(r2.age, AIC.age, AICc.age, calibrate, mod, mod.sel, pctest, i, preds, RSS, TSS, testing, temp)
# Figure 4 - model performance plots
actual <- list()
pred_lm <- list()
pred_gam <- list()
# create dataframe with predictions vs actual age
for (i in 1:10) {
actual[[i]] <- test[[i]]$read_age
pred_lm[[i]] <- predict(lm.mods[[i]], test[[i]])
pred_gam[[i]] <- predict(GAM.mods[[i]], test[[i]])
}
predictions <- data.frame(
actual = rep(0, 54),
pred_lm = rep(0, 54),
pred_gam = rep(0, 54)
)
predictions$actual <- unlist(actual)
predictions$pred_lm <- unlist(pred_lm)
predictions$pred_gam <- unlist(pred_gam)
rm(actual, pred_lm, pred_gam)
# LM model performance
RMSE.age$lm.test <- mean(RMSE.age$lm.test)
r2lmlab <- paste("r2 =", round(mod.summary[2, 2], 3))
rmselmlab <- paste("RMSE = ", round(RMSE.age$lm.test, 3))
plot4 <- ggplot() +
theme_bw() +
geom_point(data = predictions, aes(x = actual, y = pred_lm), size = 3) +
geom_abline(slope = 1, linewidth = 1.25, col = "red", linetype = "longdash") +
xlab("Age (days)") +
ylab("Predicted age (days)") +
geom_text(aes(x = 184, y = 130), size = 4, label = r2lmlab) +
geom_text(aes(x = 180, y = 124), size = 4, label = rmselmlab) +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 20)
) +
ggtitle("MLR")
# GAM model performance
r2gamlab <- paste("r2 =", round(mod.summary[4, 2], 3))
rmsegamlab <- paste("RMSE = ", round(RMSE.age$GAM.test, 3))
plot5 <- ggplot() +
theme_bw() +
geom_point(data = predictions, aes(x = actual, y = pred_gam), size = 3) +
geom_abline(slope = 1, linewidth = 1.25, col = "red", linetype = "longdash") +
xlab("Age (days)") +
ylab("Predicted age (days)") +
geom_text(aes(x = 184, y = 125), size = 4, label = r2gamlab) +
geom_text(aes(x = 180, y = 119), size = 4, label = rmsegamlab) +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 20)
) +
ggtitle("GAM")
plot4 + plot5
# Appendix
# Using test/cal split # 5 for model performance appendix, but all 10 are available if interested
# ggplot version of lm diagnostics
gglm(lm.mods[[5]], theme = theme_bw(), theme(plot.title = element_text(size = 18), axis.title = element_text(size = 16),axis.text = element_text(size = 14)) )
# GAM diagnostics
appraise(GAM.mods[[5]]) & theme_bw() &
theme(plot.title = element_text(size = 15), axis.title = element_text(size = 14),axis.text = element_text(size = 12))
# GAM partial effects
draw(GAM.mods[[5]],residuals = T) & theme_bw() &
theme(plot.title = element_text(size = 15), axis.title = element_text(size = 14),axis.text = element_text(size = 12))